{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "6789eab6-9237-41f7-8323-9f0cd0d6e3d2",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "============GPU================\n",
      "Sat May 11 00:37:03 2024       \n",
      "+-----------------------------------------------------------------------------------------+\n",
      "| NVIDIA-SMI 550.54.14              Driver Version: 550.54.14      CUDA Version: 12.4     |\n",
      "|-----------------------------------------+------------------------+----------------------+\n",
      "| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |\n",
      "| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |\n",
      "|                                         |                        |               MIG M. |\n",
      "|=========================================+========================+======================|\n",
      "|   0  NVIDIA GeForce RTX 3090        On  |   00000000:42:00.0 Off |                  N/A |\n",
      "| 30%   49C    P8             29W /  230W |       1MiB /  24576MiB |      0%      Default |\n",
      "|                                         |                        |                  N/A |\n",
      "+-----------------------------------------+------------------------+----------------------+\n",
      "                                                                                         \n",
      "+-----------------------------------------------------------------------------------------+\n",
      "| Processes:                                                                              |\n",
      "|  GPU   GI   CI        PID   Type   Process name                              GPU Memory |\n",
      "|        ID   ID                                                               Usage      |\n",
      "|=========================================================================================|\n",
      "|  No running processes found                                                             |\n",
      "+-----------------------------------------------------------------------------------------+\n",
      "============CUDA version================\n",
      "nvcc: NVIDIA (R) Cuda compiler driver\n",
      "Copyright (c) 2005-2023 NVIDIA Corporation\n",
      "Built on Mon_Apr__3_17:16:06_PDT_2023\n",
      "Cuda compilation tools, release 12.1, V12.1.105\n",
      "Build cuda_12.1.r12.1/compiler.32688072_0\n",
      "============CPU================\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "model name\t: AMD EPYC 7702P 64-Core Processor\n",
      "============Memory================\n",
      "MemTotal:       197681312 kB\n"
     ]
    }
   ],
   "source": [
    "# GPU\n",
    "print(\"============GPU================\")\n",
    "!nvidia-smi\n",
    "\n",
    "# CUDA version\n",
    "print(\"============CUDA version================\")\n",
    "!nvcc --version\n",
    "\n",
    "# CPU\n",
    "print(\"============CPU================\")\n",
    "!cat /proc/cpuinfo | grep model\\ name\n",
    "\n",
    "# Memory\n",
    "print(\"============Memory================\")\n",
    "!cat /proc/meminfo | grep MemTotal"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "b9583f02",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Cloning into 'llama.cpp'...\n",
      "remote: Enumerating objects: 24318, done.\u001b[K\n",
      "remote: Counting objects: 100% (10090/10090), done.\u001b[K\n",
      "remote: Compressing objects: 100% (642/642), done.\u001b[K\n",
      "remote: Total 24318 (delta 9795), reused 9510 (delta 9448), pack-reused 14228\u001b[K\n",
      "Receiving objects: 100% (24318/24318), 45.23 MiB | 27.63 MiB/s, done.\n",
      "Resolving deltas: 100% (17282/17282), done.\n",
      "/workspace/llama.cpp\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/usr/local/lib/python3.10/dist-packages/IPython/core/magics/osm.py:417: UserWarning: using dhist requires you to install the `pickleshare` library.\n",
      "  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]\n"
     ]
    }
   ],
   "source": [
    "# Get the code for the first time use\n",
    "!git clone https://github.com/ggerganov/llama.cpp\n",
    "%cd llama.cpp"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "4f651103-a6c8-4b7d-9f0f-be0553b22acf",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ggml-vocab-aquila.gguf\t\t    ggml-vocab-llama-bpe.gguf\n",
      "ggml-vocab-baichuan.gguf\t    ggml-vocab-llama-bpe.gguf.inp\n",
      "ggml-vocab-bert-bge.gguf\t    ggml-vocab-llama-bpe.gguf.out\n",
      "ggml-vocab-bert-bge.gguf.inp\t    ggml-vocab-llama-spm.gguf\n",
      "ggml-vocab-bert-bge.gguf.out\t    ggml-vocab-llama-spm.gguf.inp\n",
      "ggml-vocab-command-r.gguf\t    ggml-vocab-llama-spm.gguf.out\n",
      "ggml-vocab-command-r.gguf.inp\t    ggml-vocab-mpt.gguf\n",
      "ggml-vocab-command-r.gguf.out\t    ggml-vocab-mpt.gguf.inp\n",
      "ggml-vocab-deepseek-coder.gguf\t    ggml-vocab-mpt.gguf.out\n",
      "ggml-vocab-deepseek-coder.gguf.inp  ggml-vocab-phi-3.gguf\n",
      "ggml-vocab-deepseek-coder.gguf.out  ggml-vocab-phi-3.gguf.inp\n",
      "ggml-vocab-deepseek-llm.gguf\t    ggml-vocab-phi-3.gguf.out\n",
      "ggml-vocab-deepseek-llm.gguf.inp    ggml-vocab-qwen2.gguf\n",
      "ggml-vocab-deepseek-llm.gguf.out    ggml-vocab-qwen2.gguf.inp\n",
      "ggml-vocab-falcon.gguf\t\t    ggml-vocab-qwen2.gguf.out\n",
      "ggml-vocab-falcon.gguf.inp\t    ggml-vocab-refact.gguf\n",
      "ggml-vocab-falcon.gguf.out\t    ggml-vocab-refact.gguf.inp\n",
      "ggml-vocab-gpt-2.gguf\t\t    ggml-vocab-refact.gguf.out\n",
      "ggml-vocab-gpt-2.gguf.inp\t    ggml-vocab-stablelm.gguf\n",
      "ggml-vocab-gpt-2.gguf.out\t    ggml-vocab-starcoder.gguf\n",
      "ggml-vocab-gpt-neox.gguf\t    ggml-vocab-starcoder.gguf.inp\n",
      "ggml-vocab-gpt2.gguf\t\t    ggml-vocab-starcoder.gguf.out\n"
     ]
    }
   ],
   "source": [
    "# Obtain the original LLaMA model weights and place them in ./models\n",
    "!ls ./models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "58f434b3-1493-448d-95d3-8954c1aa1267",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Get:1 http://security.ubuntu.com/ubuntu jammy-security InRelease [110 kB]\n",
      "Get:2 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease [1581 B]\n",
      "Get:3 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease [18.1 kB]3m\n",
      "Get:4 http://archive.ubuntu.com/ubuntu jammy InRelease [270 kB][33m\n",
      "Get:5 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  Packages [830 kB]\n",
      "Get:6 http://security.ubuntu.com/ubuntu jammy-security/multiverse amd64 Packages [44.7 kB]\n",
      "Get:7 http://security.ubuntu.com/ubuntu jammy-security/restricted amd64 Packages [2308 kB]\n",
      "Get:8 http://archive.ubuntu.com/ubuntu jammy-updates InRelease [119 kB]33m\u001b[33m\n",
      "Get:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy/main amd64 Packages [27.8 kB]\n",
      "Get:10 http://archive.ubuntu.com/ubuntu jammy-backports InRelease [109 kB]    \u001b[0m\u001b[33mm\n",
      "Get:11 http://archive.ubuntu.com/ubuntu jammy/multiverse amd64 Packages [266 kB]\n",
      "Get:12 http://archive.ubuntu.com/ubuntu jammy/universe amd64 Packages [17.5 MB]\n",
      "Get:13 http://security.ubuntu.com/ubuntu jammy-security/main amd64 Packages [1798 kB]\n",
      "Get:14 http://security.ubuntu.com/ubuntu jammy-security/universe amd64 Packages [1082 kB]\n",
      "Get:15 http://archive.ubuntu.com/ubuntu jammy/restricted amd64 Packages [164 kB][0m\u001b[33m\u001b[33m\u001b[33m\u001b[33m\n",
      "Get:16 http://archive.ubuntu.com/ubuntu jammy/main amd64 Packages [1792 kB]\n",
      "Get:17 http://archive.ubuntu.com/ubuntu jammy-updates/restricted amd64 Packages [2382 kB]\n",
      "Get:18 http://archive.ubuntu.com/ubuntu jammy-updates/multiverse amd64 Packages [51.1 kB]\n",
      "Get:19 http://archive.ubuntu.com/ubuntu jammy-updates/main amd64 Packages [2069 kB]\n",
      "Get:20 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 Packages [1374 kB]\n",
      "Get:21 http://archive.ubuntu.com/ubuntu jammy-backports/main amd64 Packages [81.0 kB]\n",
      "Get:22 http://archive.ubuntu.com/ubuntu jammy-backports/universe amd64 Packages [31.9 kB]\n",
      "Fetched 32.4 MB in 3s (11.5 MB/s)[33m                        \u001b[0m\u001b[33m\u001b[33m\u001b[33m\u001b[33m\u001b[33m\u001b[33m\n",
      "Reading package lists... Done\n",
      "Building dependency tree... Done\n",
      "Reading state information... Done\n",
      "59 packages can be upgraded. Run 'apt list --upgradable' to see them.\n",
      "Reading package lists... Done\n",
      "Building dependency tree... Done\n",
      "Reading state information... Done\n",
      "The following NEW packages will be installed:\n",
      "  git-lfs\n",
      "0 upgraded, 1 newly installed, 0 to remove and 59 not upgraded.\n",
      "Need to get 3503 kB of archives.\n",
      "After this operation, 10.4 MB of additional disk space will be used.\n",
      "Get:1 http://archive.ubuntu.com/ubuntu jammy-updates/universe amd64 git-lfs amd64 3.0.2-1ubuntu0.2 [3503 kB]\n",
      "Fetched 3503 kB in 1s (3566 kB/s) \u001b[0m33m\u001b[33m\n",
      "debconf: delaying package configuration, since apt-utils is not installed\n",
      "\n",
      "\u001b7\u001b[0;23r\u001b8\u001b[1ASelecting previously unselected package git-lfs.\n",
      "(Reading database ... 21038 files and directories currently installed.)\n",
      "Preparing to unpack .../git-lfs_3.0.2-1ubuntu0.2_amd64.deb ...\n",
      "\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [  0%]\u001b[49m\u001b[39m [..........................................................] \u001b8\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 20%]\u001b[49m\u001b[39m [###########...............................................] \u001b8Unpacking git-lfs (3.0.2-1ubuntu0.2) ...\n",
      "\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 40%]\u001b[49m\u001b[39m [#######################...................................] \u001b8Setting up git-lfs (3.0.2-1ubuntu0.2) ...\n",
      "\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 60%]\u001b[49m\u001b[39m [##################################........................] \u001b8\u001b7\u001b[24;0f\u001b[42m\u001b[30mProgress: [ 80%]\u001b[49m\u001b[39m [##############################################............] \u001b8\n",
      "\u001b7\u001b[0;24r\u001b8\u001b[1A\u001b[J"
     ]
    }
   ],
   "source": [
    "# Get git-lfs to clone large files\n",
    "!apt update\n",
    "!apt install git-lfs"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "7dfe726e-cc24-4a61-bdf4-e85eff079b47",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Generating public/private ed25519 key pair.\n",
      "Your identification has been saved in /root/.ssh/id_ed25519\n",
      "Your public key has been saved in /root/.ssh/id_ed25519.pub\n",
      "The key fingerprint is:\n",
      "SHA256:i3eVPxqYMNDDYrigawSxJBB1WRepReYuri3rgsnMJrQ your.email@example.com\n",
      "The key's randomart image is:\n",
      "+--[ED25519 256]--+\n",
      "|*+. .o..=o       |\n",
      "|oo ... *o        |\n",
      "|o . . +o=        |\n",
      "|.. . o.+ .   .   |\n",
      "|..  . . S   o    |\n",
      "|.o   . o + + .   |\n",
      "|*+.   o o + . o  |\n",
      "|=E. .o . .   o . |\n",
      "|o  o+o.     .    |\n",
      "+----[SHA256]-----+\n"
     ]
    }
   ],
   "source": [
    "# Generating a new SSH keypair\n",
    "!ssh-keygen -t ed25519 -C \"your.email@example.com\" -f \"/root/.ssh/id_ed25519\" -N \"\""
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "14a22530-47e3-435b-96d3-b1799713f86b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAILTtHWv2cy06fzwxV/gHIjlcYQFCKj+g+i2bNyxpZ8Ef your.email@example.com\n"
     ]
    }
   ],
   "source": [
    "# You will need to add your SSH public key to your huggingface.co account: https://huggingface.co/settings/keys\n",
    "!cat /root/.ssh/id_ed25519.pub"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "a3bafad2-77b7-4c39-b846-e3339baff61e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "# hf.co:22 SSH-2.0-Huggingface-SSHD\n",
      "# hf.co:22 SSH-2.0-Huggingface-SSHD\n",
      "# hf.co:22 SSH-2.0-Huggingface-SSHD\n",
      "# hf.co:22 SSH-2.0-Huggingface-SSHD\n",
      "# hf.co:22 SSH-2.0-Huggingface-SSHD\n",
      "Hi JaaackXD, welcome to Hugging Face.\n"
     ]
    }
   ],
   "source": [
    "# If you see a message with your username, congrats! Everything went well, you are ready to use git over SSH.\n",
    "!ssh-keyscan -H hf.co >> ~/.ssh/known_hosts\n",
    "!ssh -T git@hf.co"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "4e63016a",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/workspace/llama.cpp/models\n",
      "Updated git hooks.\n",
      "Git LFS initialized.\n",
      "Cloning into 'Meta-Llama-3-8B'...\n",
      "remote: Enumerating objects: 45, done.\u001b[K\n",
      "remote: Counting objects: 100% (18/18), done.\u001b[K\n",
      "remote: Compressing objects: 100% (15/15), done.\u001b[K\n",
      "remote: Total 45 (delta 11), reused 3 (delta 3), pack-reused 27 (from 1)\u001b[K\n",
      "Receiving objects: 100% (45/45), 4.49 MiB | 5.90 MiB/s, done.\n",
      "Resolving deltas: 100% (11/11), done.\n",
      "Updating files: 100% (17/17), done.\n",
      "Filtering content: 100% (6/6), 5.91 GiB | 22.00 MiB/s, done.\n",
      "Encountered 4 file(s) that may not have been copied correctly on Windows:\n",
      "\tmodel-00003-of-00004.safetensors\n",
      "\tmodel-00002-of-00004.safetensors\n",
      "\tmodel-00001-of-00004.safetensors\n",
      "\toriginal/consolidated.00.pth\n",
      "\n",
      "See: `git lfs help smudge` for more details.\n",
      "Cloning into 'Meta-Llama-3-70B'...\n",
      "remote: Enumerating objects: 129, done.\u001b[K\n",
      "remote: Counting objects: 100% (124/124), done.\u001b[K\n",
      "remote: Compressing objects: 100% (124/124), done.\u001b[K\n",
      "remote: Total 129 (delta 24), reused 0 (delta 0), pack-reused 5 (from 1)\u001b[K\n",
      "Receiving objects: 100% (129/129), 2.28 MiB | 4.10 MiB/s, done.\n",
      "Resolving deltas: 100% (24/24), done.\n",
      "Updating files: 100% (50/50), done.\n",
      "Filtering content: 100% (39/39), 18.85 GiB | 9.69 MiB/s, done.\n",
      "Encountered 37 file(s) that may not have been copied correctly on Windows:\n",
      "\toriginal/consolidated.00.pth\n",
      "\toriginal/consolidated.06.pth\n",
      "\toriginal/consolidated.05.pth\n",
      "\toriginal/consolidated.01.pth\n",
      "\toriginal/consolidated.02.pth\n",
      "\toriginal/consolidated.04.pth\n",
      "\toriginal/consolidated.07.pth\n",
      "\toriginal/consolidated.03.pth\n",
      "\tmodel-00018-of-00030.safetensors\n",
      "\tmodel-00008-of-00030.safetensors\n",
      "\tmodel-00028-of-00030.safetensors\n",
      "\tmodel-00013-of-00030.safetensors\n",
      "\tmodel-00003-of-00030.safetensors\n",
      "\tmodel-00029-of-00030.safetensors\n",
      "\tmodel-00014-of-00030.safetensors\n",
      "\tmodel-00024-of-00030.safetensors\n",
      "\tmodel-00011-of-00030.safetensors\n",
      "\tmodel-00009-of-00030.safetensors\n",
      "\tmodel-00021-of-00030.safetensors\n",
      "\tmodel-00016-of-00030.safetensors\n",
      "\tmodel-00006-of-00030.safetensors\n",
      "\tmodel-00019-of-00030.safetensors\n",
      "\tmodel-00004-of-00030.safetensors\n",
      "\tmodel-00007-of-00030.safetensors\n",
      "\tmodel-00026-of-00030.safetensors\n",
      "\tmodel-00022-of-00030.safetensors\n",
      "\tmodel-00027-of-00030.safetensors\n",
      "\tmodel-00017-of-00030.safetensors\n",
      "\tmodel-00012-of-00030.safetensors\n",
      "\tmodel-00002-of-00030.safetensors\n",
      "\tmodel-00015-of-00030.safetensors\n",
      "\tmodel-00005-of-00030.safetensors\n",
      "\tmodel-00001-of-00030.safetensors\n",
      "\tmodel-00020-of-00030.safetensors\n",
      "\tmodel-00025-of-00030.safetensors\n",
      "\tmodel-00010-of-00030.safetensors\n",
      "\tmodel-00023-of-00030.safetensors\n",
      "\n",
      "See: `git lfs help smudge` for more details.\n"
     ]
    }
   ],
   "source": [
    "# Clone/Download the model files from Meta HF repo: https://huggingface.co/meta-llama. If you are having issues with getting access, feel free to clone from my HF repo: https://huggingface.co/JaaackXD\n",
    "%cd models\n",
    "!git lfs install\n",
    "!git clone git@hf.co:meta-llama/Meta-Llama-3-8B\n",
    "!git clone git@hf.co:meta-llama/Meta-Llama-3-70B"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "603a64eb",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "/workspace/llama.cpp\n"
     ]
    }
   ],
   "source": [
    "!mkdir 8B-v3\n",
    "!mkdir 70B-v3\n",
    "%cd .."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "43e9d80e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "I ccache not found. Consider installing it for faster compilation.\n",
      "I llama.cpp build info: \n",
      "I UNAME_S:   Linux\n",
      "I UNAME_P:   x86_64\n",
      "I UNAME_M:   x86_64\n",
      "I CFLAGS:    -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion \n",
      "I CXXFLAGS:  -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE \n",
      "I NVCCFLAGS: -std=c++11 -O3 \n",
      "I LDFLAGS:    \n",
      "I CC:        cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\n",
      "I CXX:       c++ (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\n",
      "\n",
      "rm -vrf *.o tests/*.o *.so *.a *.dll benchmark-matmult lookup-create lookup-merge lookup-stats common/build-info.cpp *.dot *.gcno tests/*.gcno *.gcda tests/*.gcda *.gcov tests/*.gcov lcov-report gcovr-report main quantize quantize-stats perplexity imatrix embedding vdot q8dot train-text-from-scratch convert-llama2c-to-ggml simple batched batched-bench save-load-state server gguf gguf-split eval-callback llama-bench libllava.a llava-cli baby-llama beam-search retrieval speculative infill tokenize benchmark-matmult parallel finetune export-lora lookahead lookup passkey gritlm tests/test-c.o tests/test-autorelease tests/test-backend-ops tests/test-double-float tests/test-grad0 tests/test-grammar-integration tests/test-grammar-parser tests/test-json-schema-to-grammar tests/test-llama-grammar tests/test-model-load-cancel tests/test-opt tests/test-quantize-fns tests/test-quantize-perf tests/test-rope tests/test-sampling tests/test-tokenizer-0 tests/test-tokenizer-1-bpe tests/test-tokenizer-1-spm\n",
      "rm -vrf ggml-cuda/*.o\n",
      "find examples pocs -type f -name \"*.o\" -delete\n",
      "I ccache not found. Consider installing it for faster compilation.\n",
      "I llama.cpp build info: \n",
      "I UNAME_S:   Linux\n",
      "I UNAME_P:   x86_64\n",
      "I UNAME_M:   x86_64\n",
      "I CFLAGS:    -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion \n",
      "I CXXFLAGS:  -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS \n",
      "I NVCCFLAGS: -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128 \n",
      "I LDFLAGS:   -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "I CC:        cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\n",
      "I CXX:       c++ (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0\n",
      "I NVCC:      Build cuda_12.1.r12.1/compiler.32688072_0\n",
      "\n",
      "!!!!\n",
      "LLAMA_CUBLAS is deprecated and will be removed in the future. Use LLAMA_CUDA instead.\n",
      "!!!!\n",
      "\n",
      "cc  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion    -c ggml.c -o ggml.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c llama.cpp -o llama.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c common/common.cpp -o common.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c common/sampling.cpp -o sampling.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c common/grammar-parser.cpp -o grammar-parser.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c common/json-schema-to-grammar.cpp -o json-schema-to-grammar.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c common/console.cpp -o console.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c sgemm.cpp -o sgemm.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda.cu -o ggml-cuda.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/acc.cu -o ggml-cuda/acc.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/alibi.cu -o ggml-cuda/alibi.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/arange.cu -o ggml-cuda/arange.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/argsort.cu -o ggml-cuda/argsort.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/binbcast.cu -o ggml-cuda/binbcast.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/clamp.cu -o ggml-cuda/clamp.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/concat.cu -o ggml-cuda/concat.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/convert.cu -o ggml-cuda/convert.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/cpy.cu -o ggml-cuda/cpy.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/diagmask.cu -o ggml-cuda/diagmask.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/dmmv.cu -o ggml-cuda/dmmv.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/fattn.cu -o ggml-cuda/fattn.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/getrows.cu -o ggml-cuda/getrows.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/im2col.cu -o ggml-cuda/im2col.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/mmq.cu -o ggml-cuda/mmq.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/mmvq.cu -o ggml-cuda/mmvq.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/norm.cu -o ggml-cuda/norm.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/pad.cu -o ggml-cuda/pad.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/pool2d.cu -o ggml-cuda/pool2d.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/quantize.cu -o ggml-cuda/quantize.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/rope.cu -o ggml-cuda/rope.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/scale.cu -o ggml-cuda/scale.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/softmax.cu -o ggml-cuda/softmax.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/sumrows.cu -o ggml-cuda/sumrows.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/tsembd.cu -o ggml-cuda/tsembd.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/unary.cu -o ggml-cuda/unary.o\n",
      "nvcc -std=c++11 -O3 -use_fast_math --forward-unknown-to-host-compiler -arch=native -DGGML_CUDA_DMMV_X=32 -DGGML_CUDA_MMV_Y=1 -DK_QUANTS_PER_ITERATION=2 -DGGML_CUDA_PEER_MAX_BATCH_SIZE=128  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -Xcompiler \"-std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -Wno-array-bounds -Wno-format-truncation -Wextra-semi -Wno-pedantic\" -c ggml-cuda/upscale.cu -o ggml-cuda/upscale.o\n",
      "cc  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion    -c ggml-alloc.c -o ggml-alloc.o\n",
      "cc  -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion    -c ggml-backend.c -o ggml-backend.o\n",
      "cc -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion     -c ggml-quants.c -o ggml-quants.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c unicode.cpp -o unicode.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c unicode-data.cpp -o unicode-data.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c common/train.cpp -o train.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c common/ngram-cache.cpp -o ngram-cache.o\n",
      "cc -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -std=c11   -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wshadow -Wstrict-prototypes -Wpointer-arith -Wmissing-prototypes -Werror=implicit-int -Werror=implicit-function-declaration -pthread -march=native -mtune=native -Wdouble-promotion  -c tests/test-c.c -o tests/test-c.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c common/build-info.cpp -o build-info.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/main/main.cpp -o examples/main/main.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/quantize/quantize.cpp -o examples/quantize/quantize.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/quantize-stats/quantize-stats.cpp -o examples/quantize-stats/quantize-stats.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/perplexity/perplexity.cpp -o examples/perplexity/perplexity.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/imatrix/imatrix.cpp -o examples/imatrix/imatrix.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/embedding/embedding.cpp -o examples/embedding/embedding.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c pocs/vdot/vdot.cpp -o pocs/vdot/vdot.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c pocs/vdot/q8dot.cpp -o pocs/vdot/q8dot.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/train-text-from-scratch/train-text-from-scratch.cpp -o examples/train-text-from-scratch/train-text-from-scratch.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.cpp -o examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/simple/simple.cpp -o examples/simple/simple.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/batched/batched.cpp -o examples/batched/batched.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/batched-bench/batched-bench.cpp -o examples/batched-bench/batched-bench.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/save-load-state/save-load-state.cpp -o examples/save-load-state/save-load-state.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/server/server.cpp -o examples/server/server.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/gguf/gguf.cpp -o examples/gguf/gguf.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/gguf-split/gguf-split.cpp -o examples/gguf-split/gguf-split.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/eval-callback/eval-callback.cpp -o examples/eval-callback/eval-callback.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/llama-bench/llama-bench.cpp -o examples/llama-bench/llama-bench.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -static -fPIC -c examples/llava/llava.cpp -o libllava.a -Wno-cast-qual\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/llava/llava-cli.cpp -o examples/llava/llava-cli.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/baby-llama/baby-llama.cpp -o examples/baby-llama/baby-llama.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/beam-search/beam-search.cpp -o examples/beam-search/beam-search.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/retrieval/retrieval.cpp -o examples/retrieval/retrieval.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/speculative/speculative.cpp -o examples/speculative/speculative.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/infill/infill.cpp -o examples/infill/infill.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/tokenize/tokenize.cpp -o examples/tokenize/tokenize.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/benchmark/benchmark-matmult.cpp -o examples/benchmark/benchmark-matmult.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/parallel/parallel.cpp -o examples/parallel/parallel.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/finetune/finetune.cpp -o examples/finetune/finetune.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/export-lora/export-lora.cpp -o examples/export-lora/export-lora.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/lookahead/lookahead.cpp -o examples/lookahead/lookahead.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/lookup/lookup.cpp -o examples/lookup/lookup.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/passkey/passkey.cpp -o examples/passkey/passkey.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/gritlm/gritlm.cpp -o examples/gritlm/gritlm.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o pocs/vdot/vdot.o -o vdot -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o train.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/baby-llama/baby-llama.o -o baby-llama -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/gguf/gguf.o -o gguf -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/eval-callback/eval-callback.o -o eval-callback -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/embedding/embedding.o -o embedding -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o pocs/vdot/q8dot.o -o q8dot -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/tokenize/tokenize.o -o tokenize -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  build-info.o ggml.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/benchmark/benchmark-matmult.o -o benchmark-matmult -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/quantize/quantize.o -o quantize -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/beam-search/beam-search.o -o beam-search -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/simple/simple.o -o simple -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/gguf-split/gguf-split.o -o gguf-split -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/save-load-state/save-load-state.o -o save-load-state -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  build-info.o ggml.o llama.o common.o sampling.o grammar-parser.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/batched-bench/batched-bench.o -o batched-bench -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/gritlm/gritlm.o -o gritlm -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o ngram-cache.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/lookup/lookup.o -o lookup -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/batched/batched.o -o batched -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o train.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/train-text-from-scratch/train-text-from-scratch.o -o train-text-from-scratch -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/passkey/passkey.o -o passkey -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/export-lora/export-lora.o -o export-lora -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/lookup/lookup-create.cpp -o examples/lookup/lookup-create.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/llava/clip.cpp  -o examples/llava/clip.o -Wno-cast-qual\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/lookahead/lookahead.o -o lookahead -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/parallel/parallel.o -o parallel -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o train.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/finetune/finetune.o -o finetune -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/convert-llama2c-to-ggml/convert-llama2c-to-ggml.o -o convert-llama2c-to-ggml -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/retrieval/retrieval.o -o retrieval -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/imatrix/imatrix.o -o imatrix -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o console.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/infill/infill.o -o infill -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/speculative/speculative.o -o speculative -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o ngram-cache.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/lookup/lookup-create.o -o lookup-create -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/lookup/lookup-merge.cpp -o examples/lookup/lookup-merge.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o console.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/main/main.o -o main -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "\n",
      "====  Run ./main -h for help.  ====\n",
      "\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o ngram-cache.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/lookup/lookup-merge.o -o lookup-merge -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/lookup/lookup-stats.cpp -o examples/lookup/lookup-stats.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/perplexity/perplexity.o -o perplexity -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  build-info.o ggml.o llama.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/quantize-stats/quantize-stats.o -o quantize-stats -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o ngram-cache.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/lookup/lookup-stats.o -o lookup-stats -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/llama-bench/llama-bench.o -o llama-bench -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  -c examples/llava/llava.cpp -o examples/llava/llava.o\n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o examples/llava/llava-cli.o examples/llava/clip.o examples/llava/llava.o -o llava-cli -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib \n",
      "c++ -std=c++11 -fPIC -O3 -Wall -Wextra -Wpedantic -Wcast-qual -Wno-unused-function -Wmissing-declarations -Wmissing-noreturn -pthread  -march=native -mtune=native -Wno-array-bounds -Wno-format-truncation -Wextra-semi -I. -Icommon -D_XOPEN_SOURCE=600 -D_GNU_SOURCE -DNDEBUG -DGGML_USE_LLAMAFILE -DGGML_USE_CUDA -I/usr/local/cuda/include -I/usr/local/cuda/targets/x86_64-linux/include -DGGML_CUDA_USE_GRAPHS  ggml.o llama.o common.o sampling.o grammar-parser.o build-info.o json-schema-to-grammar.o sgemm.o ggml-cuda.o ggml-cuda/acc.o ggml-cuda/alibi.o ggml-cuda/arange.o ggml-cuda/argsort.o ggml-cuda/binbcast.o ggml-cuda/clamp.o ggml-cuda/concat.o ggml-cuda/convert.o ggml-cuda/cpy.o ggml-cuda/diagmask.o ggml-cuda/dmmv.o ggml-cuda/fattn.o ggml-cuda/getrows.o ggml-cuda/im2col.o ggml-cuda/mmq.o ggml-cuda/mmvq.o ggml-cuda/norm.o ggml-cuda/pad.o ggml-cuda/pool2d.o ggml-cuda/quantize.o ggml-cuda/rope.o ggml-cuda/scale.o ggml-cuda/softmax.o ggml-cuda/sumrows.o ggml-cuda/tsembd.o ggml-cuda/unary.o ggml-cuda/upscale.o ggml-alloc.o ggml-backend.o ggml-quants.o unicode.o unicode-data.o -Iexamples/server examples/server/server.o -o server -lcuda -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/usr/lib64 -L/usr/local/cuda/targets/x86_64-linux/lib -L/usr/lib/wsl/lib  \n"
     ]
    }
   ],
   "source": [
    "# Build\n",
    "!make clean && LLAMA_CUBLAS=1 make -j"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "3ca0e26e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Collecting numpy~=1.24.4 (from -r ./requirements/requirements-convert.txt (line 1))\n",
      "  Downloading numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (5.6 kB)\n",
      "Collecting sentencepiece~=0.2.0 (from -r ./requirements/requirements-convert.txt (line 2))\n",
      "  Downloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.7 kB)\n",
      "Collecting transformers<5.0.0,>=4.40.1 (from -r ./requirements/requirements-convert.txt (line 3))\n",
      "  Downloading transformers-4.40.2-py3-none-any.whl.metadata (137 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m138.0/138.0 kB\u001b[0m \u001b[31m2.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0ma \u001b[36m0:00:01\u001b[0m\n",
      "\u001b[?25hCollecting gguf>=0.1.0 (from -r ./requirements/requirements-convert.txt (line 4))\n",
      "  Downloading gguf-0.6.0-py3-none-any.whl.metadata (3.2 kB)\n",
      "Collecting protobuf<5.0.0,>=4.21.0 (from -r ./requirements/requirements-convert.txt (line 5))\n",
      "  Downloading protobuf-4.25.3-cp37-abi3-manylinux2014_x86_64.whl.metadata (541 bytes)\n",
      "Collecting torch~=2.1.1 (from -r ./requirements/requirements-convert-hf-to-gguf.txt (line 2))\n",
      "  Downloading torch-2.1.2-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)\n",
      "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3)) (3.13.1)\n",
      "Collecting huggingface-hub<1.0,>=0.19.3 (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3))\n",
      "  Downloading huggingface_hub-0.23.0-py3-none-any.whl.metadata (12 kB)\n",
      "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3)) (23.2)\n",
      "Requirement already satisfied: pyyaml>=5.1 in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3)) (6.0.1)\n",
      "Collecting regex!=2019.12.17 (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3))\n",
      "  Downloading regex-2024.5.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (40 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m40.9/40.9 kB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: requests in /usr/local/lib/python3.10/dist-packages (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3)) (2.31.0)\n",
      "Collecting tokenizers<0.20,>=0.19 (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3))\n",
      "  Downloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (6.7 kB)\n",
      "Collecting safetensors>=0.4.1 (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3))\n",
      "  Downloading safetensors-0.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.8 kB)\n",
      "Collecting tqdm>=4.27 (from transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3))\n",
      "  Downloading tqdm-4.66.4-py3-none-any.whl.metadata (57 kB)\n",
      "\u001b[2K     \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m57.6/57.6 kB\u001b[0m \u001b[31m7.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hRequirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (4.9.0)\n",
      "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (1.12)\n",
      "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (3.2.1)\n",
      "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (3.1.3)\n",
      "Requirement already satisfied: fsspec in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (2024.2.0)\n",
      "Requirement already satisfied: nvidia-cuda-nvrtc-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (12.1.105)\n",
      "Requirement already satisfied: nvidia-cuda-runtime-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (12.1.105)\n",
      "Requirement already satisfied: nvidia-cuda-cupti-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (12.1.105)\n",
      "Requirement already satisfied: nvidia-cudnn-cu12==8.9.2.26 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (8.9.2.26)\n",
      "Requirement already satisfied: nvidia-cublas-cu12==12.1.3.1 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (12.1.3.1)\n",
      "Requirement already satisfied: nvidia-cufft-cu12==11.0.2.54 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (11.0.2.54)\n",
      "Requirement already satisfied: nvidia-curand-cu12==10.3.2.106 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (10.3.2.106)\n",
      "Requirement already satisfied: nvidia-cusolver-cu12==11.4.5.107 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (11.4.5.107)\n",
      "Requirement already satisfied: nvidia-cusparse-cu12==12.1.0.106 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (12.1.0.106)\n",
      "Collecting nvidia-nccl-cu12==2.18.1 (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2))\n",
      "  Downloading nvidia_nccl_cu12-2.18.1-py3-none-manylinux1_x86_64.whl.metadata (1.8 kB)\n",
      "Requirement already satisfied: nvidia-nvtx-cu12==12.1.105 in /usr/local/lib/python3.10/dist-packages (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (12.1.105)\n",
      "Collecting triton==2.1.0 (from torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2))\n",
      "  Downloading triton-2.1.0-0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (1.3 kB)\n",
      "Requirement already satisfied: nvidia-nvjitlink-cu12 in /usr/local/lib/python3.10/dist-packages (from nvidia-cusolver-cu12==11.4.5.107->torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (12.3.101)\n",
      "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (2.1.5)\n",
      "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.10/dist-packages (from requests->transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3)) (3.3.2)\n",
      "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests->transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3)) (3.6)\n",
      "Requirement already satisfied: urllib3<3,>=1.21.1 in /usr/local/lib/python3.10/dist-packages (from requests->transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3)) (2.2.0)\n",
      "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests->transformers<5.0.0,>=4.40.1->-r ./requirements/requirements-convert.txt (line 3)) (2024.2.2)\n",
      "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch~=2.1.1->-r ./requirements/requirements-convert-hf-to-gguf.txt (line 2)) (1.3.0)\n",
      "Downloading numpy-1.24.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (17.3 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.3/17.3 MB\u001b[0m \u001b[31m80.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hDownloading sentencepiece-0.2.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m96.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading transformers-4.40.2-py3-none-any.whl (9.0 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m9.0/9.0 MB\u001b[0m \u001b[31m96.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hDownloading gguf-0.6.0-py3-none-any.whl (23 kB)\n",
      "Downloading protobuf-4.25.3-cp37-abi3-manylinux2014_x86_64.whl (294 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m294.6/294.6 kB\u001b[0m \u001b[31m45.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading torch-2.1.2-cp310-cp310-manylinux1_x86_64.whl (670.2 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m670.2/670.2 MB\u001b[0m \u001b[31m9.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m:00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hDownloading nvidia_nccl_cu12-2.18.1-py3-none-manylinux1_x86_64.whl (209.8 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m209.8/209.8 MB\u001b[0m \u001b[31m17.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hDownloading triton-2.1.0-0-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (89.2 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m89.2/89.2 MB\u001b[0m \u001b[31m28.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hDownloading huggingface_hub-0.23.0-py3-none-any.whl (401 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m401.2/401.2 kB\u001b[0m \u001b[31m35.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading regex-2024.5.10-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (774 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m774.1/774.1 kB\u001b[0m \u001b[31m56.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading safetensors-0.4.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.2/1.2 MB\u001b[0m \u001b[31m45.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hDownloading tokenizers-0.19.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.6 MB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.6/3.6 MB\u001b[0m \u001b[31m42.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m00:01\u001b[0m00:01\u001b[0m\n",
      "\u001b[?25hDownloading tqdm-4.66.4-py3-none-any.whl (78 kB)\n",
      "\u001b[2K   \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m78.3/78.3 kB\u001b[0m \u001b[31m16.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
      "\u001b[?25hInstalling collected packages: sentencepiece, triton, tqdm, safetensors, regex, protobuf, nvidia-nccl-cu12, numpy, huggingface-hub, gguf, torch, tokenizers, transformers\n",
      "  Attempting uninstall: triton\n",
      "    Found existing installation: triton 2.2.0\n",
      "    Uninstalling triton-2.2.0:\n",
      "      Successfully uninstalled triton-2.2.0\n",
      "  Attempting uninstall: nvidia-nccl-cu12\n",
      "    Found existing installation: nvidia-nccl-cu12 2.19.3\n",
      "    Uninstalling nvidia-nccl-cu12-2.19.3:\n",
      "      Successfully uninstalled nvidia-nccl-cu12-2.19.3\n",
      "  Attempting uninstall: numpy\n",
      "    Found existing installation: numpy 1.26.3\n",
      "    Uninstalling numpy-1.26.3:\n",
      "      Successfully uninstalled numpy-1.26.3\n",
      "  Attempting uninstall: torch\n",
      "    Found existing installation: torch 2.2.0\n",
      "    Uninstalling torch-2.2.0:\n",
      "      Successfully uninstalled torch-2.2.0\n",
      "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
      "torchaudio 2.2.0 requires torch==2.2.0, but you have torch 2.1.2 which is incompatible.\n",
      "torchvision 0.17.0 requires torch==2.2.0, but you have torch 2.1.2 which is incompatible.\u001b[0m\u001b[31m\n",
      "\u001b[0mSuccessfully installed gguf-0.6.0 huggingface-hub-0.23.0 numpy-1.24.4 nvidia-nccl-cu12-2.18.1 protobuf-4.25.3 regex-2024.5.10 safetensors-0.4.3 sentencepiece-0.2.0 tokenizers-0.19.1 torch-2.1.2 tqdm-4.66.4 transformers-4.40.2 triton-2.1.0\n",
      "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n",
      "\u001b[0mINFO:hf-to-gguf:Loading model: Meta-Llama-3-8B\n",
      "INFO:gguf.gguf_writer:gguf: This GGUF file is for Little Endian only\n",
      "INFO:hf-to-gguf:Set model parameters\n",
      "INFO:hf-to-gguf:gguf: context length = 8192\n",
      "INFO:hf-to-gguf:gguf: embedding length = 4096\n",
      "INFO:hf-to-gguf:gguf: feed forward length = 14336\n",
      "INFO:hf-to-gguf:gguf: head count = 32\n",
      "INFO:hf-to-gguf:gguf: key-value head count = 8\n",
      "INFO:hf-to-gguf:gguf: rope theta = 500000.0\n",
      "INFO:hf-to-gguf:gguf: rms norm epsilon = 1e-05\n",
      "INFO:hf-to-gguf:gguf: file type = 1\n",
      "INFO:hf-to-gguf:Set model tokenizer\n",
      "Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.\n",
      "INFO:gguf.vocab:Adding 280147 merge(s).\n",
      "INFO:gguf.vocab:Setting special token type bos to 128000\n",
      "INFO:gguf.vocab:Setting special token type eos to 128001\n",
      "INFO:hf-to-gguf:Exporting model to 'models/8B-v3/ggml-model-f16.gguf'\n",
      "INFO:hf-to-gguf:gguf: loading model weight map from 'model.safetensors.index.json'\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00001-of-00004.safetensors'\n",
      "INFO:hf-to-gguf:token_embd.weight,           torch.bfloat16 --> float16, shape = {4096, 128256}\n",
      "INFO:hf-to-gguf:blk.0.attn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.0.ffn_down.weight,       torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.0.ffn_gate.weight,       torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.0.ffn_up.weight,         torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.0.ffn_norm.weight,       torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.0.attn_k.weight,         torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.0.attn_output.weight,    torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.0.attn_q.weight,         torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.0.attn_v.weight,         torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.1.attn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.1.ffn_down.weight,       torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.1.ffn_gate.weight,       torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.1.ffn_up.weight,         torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.1.ffn_norm.weight,       torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.1.attn_k.weight,         torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.1.attn_output.weight,    torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.1.attn_q.weight,         torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.1.attn_v.weight,         torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.2.attn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.2.ffn_down.weight,       torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.2.ffn_gate.weight,       torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.2.ffn_up.weight,         torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.2.ffn_norm.weight,       torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.2.attn_k.weight,         torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.2.attn_output.weight,    torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.2.attn_q.weight,         torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.2.attn_v.weight,         torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.3.attn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.3.ffn_down.weight,       torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.3.ffn_gate.weight,       torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.3.ffn_up.weight,         torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.3.ffn_norm.weight,       torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.3.attn_k.weight,         torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.3.attn_output.weight,    torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.3.attn_q.weight,         torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.3.attn_v.weight,         torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.4.attn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.4.ffn_down.weight,       torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.4.ffn_gate.weight,       torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.4.ffn_up.weight,         torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.4.ffn_norm.weight,       torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.4.attn_k.weight,         torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.4.attn_output.weight,    torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.4.attn_q.weight,         torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.4.attn_v.weight,         torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.5.attn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.5.ffn_down.weight,       torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.5.ffn_gate.weight,       torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.5.ffn_up.weight,         torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.5.ffn_norm.weight,       torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.5.attn_k.weight,         torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.5.attn_output.weight,    torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.5.attn_q.weight,         torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.5.attn_v.weight,         torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.6.attn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.6.ffn_down.weight,       torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.6.ffn_gate.weight,       torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.6.ffn_up.weight,         torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.6.ffn_norm.weight,       torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.6.attn_k.weight,         torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.6.attn_output.weight,    torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.6.attn_q.weight,         torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.6.attn_v.weight,         torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.7.attn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.7.ffn_down.weight,       torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.7.ffn_gate.weight,       torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.7.ffn_up.weight,         torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.7.ffn_norm.weight,       torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.7.attn_k.weight,         torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.7.attn_output.weight,    torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.7.attn_q.weight,         torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.7.attn_v.weight,         torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.8.attn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.8.ffn_down.weight,       torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.8.ffn_gate.weight,       torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.8.ffn_up.weight,         torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.8.ffn_norm.weight,       torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.8.attn_k.weight,         torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.8.attn_output.weight,    torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.8.attn_q.weight,         torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.8.attn_v.weight,         torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00002-of-00004.safetensors'\n",
      "INFO:hf-to-gguf:blk.10.attn_norm.weight,     torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.10.ffn_down.weight,      torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.10.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.10.ffn_up.weight,        torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.10.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.10.attn_k.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.10.attn_output.weight,   torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.10.attn_q.weight,        torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.10.attn_v.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.11.attn_norm.weight,     torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.11.ffn_down.weight,      torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.11.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.11.ffn_up.weight,        torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.11.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.11.attn_k.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.11.attn_output.weight,   torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.11.attn_q.weight,        torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.11.attn_v.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.12.attn_norm.weight,     torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.12.ffn_down.weight,      torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.12.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.12.ffn_up.weight,        torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.12.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.12.attn_k.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.12.attn_output.weight,   torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.12.attn_q.weight,        torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.12.attn_v.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.13.attn_norm.weight,     torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.13.ffn_down.weight,      torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.13.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.13.ffn_up.weight,        torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.13.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.13.attn_k.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.13.attn_output.weight,   torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.13.attn_q.weight,        torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.13.attn_v.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.14.attn_norm.weight,     torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.14.ffn_down.weight,      torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.14.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.14.ffn_up.weight,        torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.14.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.14.attn_k.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.14.attn_output.weight,   torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.14.attn_q.weight,        torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.14.attn_v.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.15.attn_norm.weight,     torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.15.ffn_down.weight,      torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.15.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.15.ffn_up.weight,        torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.15.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.15.attn_k.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.15.attn_output.weight,   torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.15.attn_q.weight,        torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.15.attn_v.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.16.attn_norm.weight,     torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.16.ffn_down.weight,      torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.16.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.16.ffn_up.weight,        torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.16.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.16.attn_k.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.16.attn_output.weight,   torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.16.attn_q.weight,        torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.16.attn_v.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.17.attn_norm.weight,     torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.17.ffn_down.weight,      torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.17.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.17.ffn_up.weight,        torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.17.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.17.attn_k.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.17.attn_output.weight,   torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.17.attn_q.weight,        torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.17.attn_v.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.18.attn_norm.weight,     torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.18.ffn_down.weight,      torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.18.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.18.ffn_up.weight,        torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.18.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.18.attn_k.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.18.attn_output.weight,   torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.18.attn_q.weight,        torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.18.attn_v.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.19.attn_norm.weight,     torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.19.ffn_down.weight,      torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.19.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.19.ffn_up.weight,        torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.19.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.19.attn_k.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.19.attn_output.weight,   torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.19.attn_q.weight,        torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.19.attn_v.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.20.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.20.attn_k.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.20.attn_output.weight,   torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.20.attn_q.weight,        torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.20.attn_v.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.9.attn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.9.ffn_down.weight,       torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.9.ffn_gate.weight,       torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.9.ffn_up.weight,         torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.9.ffn_norm.weight,       torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.9.attn_k.weight,         torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.9.attn_output.weight,    torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.9.attn_q.weight,         torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.9.attn_v.weight,         torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00003-of-00004.safetensors'\n",
      "INFO:hf-to-gguf:blk.20.attn_norm.weight,     torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.20.ffn_down.weight,      torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.20.ffn_up.weight,        torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.20.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.21.attn_norm.weight,     torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.21.ffn_down.weight,      torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.21.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.21.ffn_up.weight,        torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.21.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.21.attn_k.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.21.attn_output.weight,   torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.21.attn_q.weight,        torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.21.attn_v.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.22.attn_norm.weight,     torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.22.ffn_down.weight,      torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.22.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.22.ffn_up.weight,        torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.22.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.22.attn_k.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.22.attn_output.weight,   torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.22.attn_q.weight,        torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.22.attn_v.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.23.attn_norm.weight,     torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.23.ffn_down.weight,      torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.23.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.23.ffn_up.weight,        torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.23.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.23.attn_k.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.23.attn_output.weight,   torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.23.attn_q.weight,        torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.23.attn_v.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.24.attn_norm.weight,     torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.24.ffn_down.weight,      torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.24.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.24.ffn_up.weight,        torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.24.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.24.attn_k.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.24.attn_output.weight,   torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.24.attn_q.weight,        torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.24.attn_v.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.25.attn_norm.weight,     torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.25.ffn_down.weight,      torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.25.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.25.ffn_up.weight,        torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.25.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.25.attn_k.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.25.attn_output.weight,   torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.25.attn_q.weight,        torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.25.attn_v.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.26.attn_norm.weight,     torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.26.ffn_down.weight,      torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.26.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.26.ffn_up.weight,        torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.26.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.26.attn_k.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.26.attn_output.weight,   torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.26.attn_q.weight,        torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.26.attn_v.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.27.attn_norm.weight,     torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.27.ffn_down.weight,      torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.27.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.27.ffn_up.weight,        torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.27.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.27.attn_k.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.27.attn_output.weight,   torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.27.attn_q.weight,        torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.27.attn_v.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.28.attn_norm.weight,     torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.28.ffn_down.weight,      torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.28.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.28.ffn_up.weight,        torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.28.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.28.attn_k.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.28.attn_output.weight,   torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.28.attn_q.weight,        torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.28.attn_v.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.29.attn_norm.weight,     torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.29.ffn_down.weight,      torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.29.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.29.ffn_up.weight,        torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.29.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.29.attn_k.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.29.attn_output.weight,   torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.29.attn_q.weight,        torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.29.attn_v.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.30.attn_norm.weight,     torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.30.ffn_down.weight,      torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.30.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.30.ffn_up.weight,        torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.30.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.30.attn_k.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.30.attn_output.weight,   torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.30.attn_q.weight,        torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.30.attn_v.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.31.ffn_gate.weight,      torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.31.ffn_up.weight,        torch.bfloat16 --> float16, shape = {4096, 14336}\n",
      "INFO:hf-to-gguf:blk.31.attn_k.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:blk.31.attn_output.weight,   torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.31.attn_q.weight,        torch.bfloat16 --> float16, shape = {4096, 4096}\n",
      "INFO:hf-to-gguf:blk.31.attn_v.weight,        torch.bfloat16 --> float16, shape = {4096, 1024}\n",
      "INFO:hf-to-gguf:gguf: loading model part 'model-00004-of-00004.safetensors'\n",
      "INFO:hf-to-gguf:output.weight,               torch.bfloat16 --> float16, shape = {4096, 128256}\n",
      "INFO:hf-to-gguf:blk.31.attn_norm.weight,     torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:blk.31.ffn_down.weight,      torch.bfloat16 --> float16, shape = {14336, 4096}\n",
      "INFO:hf-to-gguf:blk.31.ffn_norm.weight,      torch.bfloat16 --> float32, shape = {4096}\n",
      "INFO:hf-to-gguf:output_norm.weight,          torch.bfloat16 --> float32, shape = {4096}\n",
      "Writing: 100%|███████████████████████████| 16.1G/16.1G [01:22<00:00, 195Mbyte/s]\n",
      "INFO:hf-to-gguf:Model successfully exported to 'models/8B-v3/ggml-model-f16.gguf'\n",
      "main: build = 2843 (e8496488)\n",
      "main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu\n",
      "main: quantizing './models/8B-v3/ggml-model-f16.gguf' to './models/8B-v3/ggml-model-Q4_K_M.gguf' as Q4_K_M\n",
      "llama_model_loader: loaded meta data with 20 key-value pairs and 291 tensors from ./models/8B-v3/ggml-model-f16.gguf (version GGUF V3 (latest))\n",
      "llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n",
      "llama_model_loader: - kv   0:                       general.architecture str              = llama\n",
      "llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B\n",
      "llama_model_loader: - kv   2:                          llama.block_count u32              = 32\n",
      "llama_model_loader: - kv   3:                       llama.context_length u32              = 8192\n",
      "llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096\n",
      "llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336\n",
      "llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32\n",
      "llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8\n",
      "llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000\n",
      "llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010\n",
      "llama_model_loader: - kv  10:                          general.file_type u32              = 1\n",
      "llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256\n",
      "llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128\n",
      "llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2\n",
      "llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe\n",
      "llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = [\"!\", \"\\\"\", \"#\", \"$\", \"%\", \"&\", \"'\", ...\n",
      "llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\n",
      "llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = [\"Ġ Ġ\", \"Ġ ĠĠĠ\", \"ĠĠ ĠĠ\", \"...\n",
      "llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000\n",
      "llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128001\n",
      "llama_model_loader: - type  f32:   65 tensors\n",
      "llama_model_loader: - type  f16:  226 tensors\n",
      "[   1/ 291]                    token_embd.weight - [ 4096, 128256,     1,     1], type =    f16, converting to q4_K .. size =  1002.00 MiB ->   281.81 MiB\n",
      "[   2/ 291]               blk.0.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[   3/ 291]                blk.0.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q6_K .. size =   112.00 MiB ->    45.94 MiB\n",
      "[   4/ 291]                blk.0.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[   5/ 291]                  blk.0.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[   6/ 291]                blk.0.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[   7/ 291]                  blk.0.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[   8/ 291]             blk.0.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[   9/ 291]                  blk.0.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[  10/ 291]                  blk.0.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q6_K .. size =     8.00 MiB ->     3.28 MiB\n",
      "[  11/ 291]               blk.1.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[  12/ 291]                blk.1.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q6_K .. size =   112.00 MiB ->    45.94 MiB\n",
      "[  13/ 291]                blk.1.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[  14/ 291]                  blk.1.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[  15/ 291]                blk.1.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[  16/ 291]                  blk.1.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[  17/ 291]             blk.1.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[  18/ 291]                  blk.1.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[  19/ 291]                  blk.1.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q6_K .. size =     8.00 MiB ->     3.28 MiB\n",
      "[  20/ 291]               blk.2.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[  21/ 291]                blk.2.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q6_K .. size =   112.00 MiB ->    45.94 MiB\n",
      "[  22/ 291]                blk.2.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[  23/ 291]                  blk.2.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[  24/ 291]                blk.2.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[  25/ 291]                  blk.2.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[  26/ 291]             blk.2.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[  27/ 291]                  blk.2.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[  28/ 291]                  blk.2.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q6_K .. size =     8.00 MiB ->     3.28 MiB\n",
      "[  29/ 291]               blk.3.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[  30/ 291]                blk.3.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q6_K .. size =   112.00 MiB ->    45.94 MiB\n",
      "[  31/ 291]                blk.3.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[  32/ 291]                  blk.3.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[  33/ 291]                blk.3.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[  34/ 291]                  blk.3.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[  35/ 291]             blk.3.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[  36/ 291]                  blk.3.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[  37/ 291]                  blk.3.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q6_K .. size =     8.00 MiB ->     3.28 MiB\n",
      "[  38/ 291]               blk.4.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[  39/ 291]                blk.4.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[  40/ 291]                blk.4.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[  41/ 291]                  blk.4.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[  42/ 291]                blk.4.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[  43/ 291]                  blk.4.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[  44/ 291]             blk.4.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[  45/ 291]                  blk.4.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[  46/ 291]                  blk.4.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[  47/ 291]               blk.5.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[  48/ 291]                blk.5.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[  49/ 291]                blk.5.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[  50/ 291]                  blk.5.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[  51/ 291]                blk.5.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[  52/ 291]                  blk.5.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[  53/ 291]             blk.5.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[  54/ 291]                  blk.5.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[  55/ 291]                  blk.5.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[  56/ 291]               blk.6.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[  57/ 291]                blk.6.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q6_K .. size =   112.00 MiB ->    45.94 MiB\n",
      "[  58/ 291]                blk.6.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[  59/ 291]                  blk.6.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[  60/ 291]                blk.6.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[  61/ 291]                  blk.6.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[  62/ 291]             blk.6.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[  63/ 291]                  blk.6.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[  64/ 291]                  blk.6.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q6_K .. size =     8.00 MiB ->     3.28 MiB\n",
      "[  65/ 291]               blk.7.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[  66/ 291]                blk.7.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[  67/ 291]                blk.7.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[  68/ 291]                  blk.7.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[  69/ 291]                blk.7.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[  70/ 291]                  blk.7.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[  71/ 291]             blk.7.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[  72/ 291]                  blk.7.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[  73/ 291]                  blk.7.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[  74/ 291]               blk.8.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[  75/ 291]                blk.8.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[  76/ 291]                blk.8.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[  77/ 291]                  blk.8.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[  78/ 291]                blk.8.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[  79/ 291]                  blk.8.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[  80/ 291]             blk.8.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[  81/ 291]                  blk.8.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[  82/ 291]                  blk.8.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[  83/ 291]              blk.10.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[  84/ 291]               blk.10.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q6_K .. size =   112.00 MiB ->    45.94 MiB\n",
      "[  85/ 291]               blk.10.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[  86/ 291]                 blk.10.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[  87/ 291]               blk.10.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[  88/ 291]                 blk.10.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[  89/ 291]            blk.10.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[  90/ 291]                 blk.10.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[  91/ 291]                 blk.10.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q6_K .. size =     8.00 MiB ->     3.28 MiB\n",
      "[  92/ 291]              blk.11.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[  93/ 291]               blk.11.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[  94/ 291]               blk.11.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[  95/ 291]                 blk.11.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[  96/ 291]               blk.11.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[  97/ 291]                 blk.11.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[  98/ 291]            blk.11.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[  99/ 291]                 blk.11.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 100/ 291]                 blk.11.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 101/ 291]              blk.12.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 102/ 291]               blk.12.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 103/ 291]               blk.12.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 104/ 291]                 blk.12.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 105/ 291]               blk.12.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 106/ 291]                 blk.12.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 107/ 291]            blk.12.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 108/ 291]                 blk.12.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 109/ 291]                 blk.12.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 110/ 291]              blk.13.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 111/ 291]               blk.13.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q6_K .. size =   112.00 MiB ->    45.94 MiB\n",
      "[ 112/ 291]               blk.13.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 113/ 291]                 blk.13.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 114/ 291]               blk.13.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 115/ 291]                 blk.13.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 116/ 291]            blk.13.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 117/ 291]                 blk.13.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 118/ 291]                 blk.13.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q6_K .. size =     8.00 MiB ->     3.28 MiB\n",
      "[ 119/ 291]              blk.14.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 120/ 291]               blk.14.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 121/ 291]               blk.14.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 122/ 291]                 blk.14.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 123/ 291]               blk.14.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 124/ 291]                 blk.14.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 125/ 291]            blk.14.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 126/ 291]                 blk.14.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 127/ 291]                 blk.14.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 128/ 291]              blk.15.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 129/ 291]               blk.15.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 130/ 291]               blk.15.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 131/ 291]                 blk.15.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 132/ 291]               blk.15.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 133/ 291]                 blk.15.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 134/ 291]            blk.15.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 135/ 291]                 blk.15.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 136/ 291]                 blk.15.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 137/ 291]              blk.16.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 138/ 291]               blk.16.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q6_K .. size =   112.00 MiB ->    45.94 MiB\n",
      "[ 139/ 291]               blk.16.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 140/ 291]                 blk.16.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 141/ 291]               blk.16.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 142/ 291]                 blk.16.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 143/ 291]            blk.16.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 144/ 291]                 blk.16.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 145/ 291]                 blk.16.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q6_K .. size =     8.00 MiB ->     3.28 MiB\n",
      "[ 146/ 291]              blk.17.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 147/ 291]               blk.17.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 148/ 291]               blk.17.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 149/ 291]                 blk.17.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 150/ 291]               blk.17.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 151/ 291]                 blk.17.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 152/ 291]            blk.17.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 153/ 291]                 blk.17.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 154/ 291]                 blk.17.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 155/ 291]              blk.18.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 156/ 291]               blk.18.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 157/ 291]               blk.18.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 158/ 291]                 blk.18.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 159/ 291]               blk.18.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 160/ 291]                 blk.18.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 161/ 291]            blk.18.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 162/ 291]                 blk.18.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 163/ 291]                 blk.18.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 164/ 291]              blk.19.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 165/ 291]               blk.19.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q6_K .. size =   112.00 MiB ->    45.94 MiB\n",
      "[ 166/ 291]               blk.19.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 167/ 291]                 blk.19.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 168/ 291]               blk.19.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 169/ 291]                 blk.19.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 170/ 291]            blk.19.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 171/ 291]                 blk.19.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 172/ 291]                 blk.19.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q6_K .. size =     8.00 MiB ->     3.28 MiB\n",
      "[ 173/ 291]               blk.20.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 174/ 291]                 blk.20.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 175/ 291]            blk.20.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 176/ 291]                 blk.20.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 177/ 291]                 blk.20.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 178/ 291]               blk.9.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 179/ 291]                blk.9.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 180/ 291]                blk.9.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 181/ 291]                  blk.9.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 182/ 291]                blk.9.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 183/ 291]                  blk.9.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 184/ 291]             blk.9.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 185/ 291]                  blk.9.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 186/ 291]                  blk.9.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 187/ 291]              blk.20.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 188/ 291]               blk.20.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 189/ 291]                 blk.20.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 190/ 291]               blk.20.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 191/ 291]              blk.21.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 192/ 291]               blk.21.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q6_K .. size =   112.00 MiB ->    45.94 MiB\n",
      "[ 193/ 291]               blk.21.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 194/ 291]                 blk.21.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 195/ 291]               blk.21.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 196/ 291]                 blk.21.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 197/ 291]            blk.21.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 198/ 291]                 blk.21.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 199/ 291]                 blk.21.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q6_K .. size =     8.00 MiB ->     3.28 MiB\n",
      "[ 200/ 291]              blk.22.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 201/ 291]               blk.22.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 202/ 291]               blk.22.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 203/ 291]                 blk.22.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 204/ 291]               blk.22.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 205/ 291]                 blk.22.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 206/ 291]            blk.22.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 207/ 291]                 blk.22.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 208/ 291]                 blk.22.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 209/ 291]              blk.23.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 210/ 291]               blk.23.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 211/ 291]               blk.23.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 212/ 291]                 blk.23.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 213/ 291]               blk.23.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 214/ 291]                 blk.23.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 215/ 291]            blk.23.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 216/ 291]                 blk.23.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 217/ 291]                 blk.23.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 218/ 291]              blk.24.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 219/ 291]               blk.24.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q6_K .. size =   112.00 MiB ->    45.94 MiB\n",
      "[ 220/ 291]               blk.24.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 221/ 291]                 blk.24.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 222/ 291]               blk.24.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 223/ 291]                 blk.24.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 224/ 291]            blk.24.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 225/ 291]                 blk.24.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 226/ 291]                 blk.24.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q6_K .. size =     8.00 MiB ->     3.28 MiB\n",
      "[ 227/ 291]              blk.25.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 228/ 291]               blk.25.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 229/ 291]               blk.25.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 230/ 291]                 blk.25.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 231/ 291]               blk.25.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 232/ 291]                 blk.25.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 233/ 291]            blk.25.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 234/ 291]                 blk.25.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 235/ 291]                 blk.25.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 236/ 291]              blk.26.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 237/ 291]               blk.26.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 238/ 291]               blk.26.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 239/ 291]                 blk.26.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 240/ 291]               blk.26.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 241/ 291]                 blk.26.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 242/ 291]            blk.26.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 243/ 291]                 blk.26.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 244/ 291]                 blk.26.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 245/ 291]              blk.27.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 246/ 291]               blk.27.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q6_K .. size =   112.00 MiB ->    45.94 MiB\n",
      "[ 247/ 291]               blk.27.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 248/ 291]                 blk.27.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 249/ 291]               blk.27.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 250/ 291]                 blk.27.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 251/ 291]            blk.27.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 252/ 291]                 blk.27.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 253/ 291]                 blk.27.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q6_K .. size =     8.00 MiB ->     3.28 MiB\n",
      "[ 254/ 291]              blk.28.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 255/ 291]               blk.28.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q6_K .. size =   112.00 MiB ->    45.94 MiB\n",
      "[ 256/ 291]               blk.28.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 257/ 291]                 blk.28.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 258/ 291]               blk.28.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 259/ 291]                 blk.28.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 260/ 291]            blk.28.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 261/ 291]                 blk.28.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 262/ 291]                 blk.28.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q6_K .. size =     8.00 MiB ->     3.28 MiB\n",
      "[ 263/ 291]              blk.29.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 264/ 291]               blk.29.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q6_K .. size =   112.00 MiB ->    45.94 MiB\n",
      "[ 265/ 291]               blk.29.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 266/ 291]                 blk.29.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 267/ 291]               blk.29.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 268/ 291]                 blk.29.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 269/ 291]            blk.29.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 270/ 291]                 blk.29.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 271/ 291]                 blk.29.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q6_K .. size =     8.00 MiB ->     3.28 MiB\n",
      "[ 272/ 291]              blk.30.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 273/ 291]               blk.30.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q6_K .. size =   112.00 MiB ->    45.94 MiB\n",
      "[ 274/ 291]               blk.30.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 275/ 291]                 blk.30.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 276/ 291]               blk.30.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 277/ 291]                 blk.30.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 278/ 291]            blk.30.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 279/ 291]                 blk.30.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 280/ 291]                 blk.30.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q6_K .. size =     8.00 MiB ->     3.28 MiB\n",
      "[ 281/ 291]               blk.31.ffn_gate.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 282/ 291]                 blk.31.ffn_up.weight - [ 4096, 14336,     1,     1], type =    f16, converting to q4_K .. size =   112.00 MiB ->    31.50 MiB\n",
      "[ 283/ 291]                 blk.31.attn_k.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q4_K .. size =     8.00 MiB ->     2.25 MiB\n",
      "[ 284/ 291]            blk.31.attn_output.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 285/ 291]                 blk.31.attn_q.weight - [ 4096,  4096,     1,     1], type =    f16, converting to q4_K .. size =    32.00 MiB ->     9.00 MiB\n",
      "[ 286/ 291]                 blk.31.attn_v.weight - [ 4096,  1024,     1,     1], type =    f16, converting to q6_K .. size =     8.00 MiB ->     3.28 MiB\n",
      "[ 287/ 291]                        output.weight - [ 4096, 128256,     1,     1], type =    f16, converting to q6_K .. size =  1002.00 MiB ->   410.98 MiB\n",
      "[ 288/ 291]              blk.31.attn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 289/ 291]               blk.31.ffn_down.weight - [14336,  4096,     1,     1], type =    f16, converting to q6_K .. size =   112.00 MiB ->    45.94 MiB\n",
      "[ 290/ 291]               blk.31.ffn_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "[ 291/ 291]                   output_norm.weight - [ 4096,     1,     1,     1], type =    f32, size =    0.016 MB\n",
      "llama_model_quantize_internal: model size  = 15317.02 MB\n",
      "llama_model_quantize_internal: quant size  =  4685.30 MB\n",
      "\n",
      "main: quantize time = 124129.97 ms\n",
      "main:    total time = 124129.97 ms\n"
     ]
    }
   ],
   "source": [
    "# Run the code if you would like to convert and quantize models by yourself\n",
    "# Install Python dependencies\n",
    "!python3 -m pip install -r requirements.txt\n",
    "\n",
    "# # Convert the HF models to ggml FP16 format (High RAM requirement!)\n",
    "!python3 convert-hf-to-gguf.py models/Meta-Llama-3-8B/ --outfile models/8B-v3/ggml-model-f16.gguf --outtype f16\n",
    "# !python3 convert-hf-to-gguf.py models/Meta-Llama-3-70B/ --outfile models/70B-v3/ggml-model-f16.gguf --outtype f16\n",
    "\n",
    "# # Quantize the model to 4-bits (using Q4_K_M method)\n",
    "!./quantize ./models/8B-v3/ggml-model-f16.gguf ./models/8B-v3/ggml-model-Q4_K_M.gguf Q4_K_M\n",
    "# !./quantize ./models/70B-v3/ggml-model-f16.gguf ./models/70B-v3/ggml-model-Q4_K_M.gguf Q4_K_M"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "30466fa3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Log start\n",
      "main: build = 2843 (e8496488)\n",
      "main: built with cc (Ubuntu 11.4.0-1ubuntu1~22.04) 11.4.0 for x86_64-linux-gnu\n",
      "main: seed  = 0\n",
      "llama_model_loader: loaded meta data with 21 key-value pairs and 291 tensors from ./models/8B-v3/ggml-model-Q4_K_M.gguf (version GGUF V3 (latest))\n",
      "llama_model_loader: Dumping metadata keys/values. Note: KV overrides do not apply in this output.\n",
      "llama_model_loader: - kv   0:                       general.architecture str              = llama\n",
      "llama_model_loader: - kv   1:                               general.name str              = Meta-Llama-3-8B\n",
      "llama_model_loader: - kv   2:                          llama.block_count u32              = 32\n",
      "llama_model_loader: - kv   3:                       llama.context_length u32              = 8192\n",
      "llama_model_loader: - kv   4:                     llama.embedding_length u32              = 4096\n",
      "llama_model_loader: - kv   5:                  llama.feed_forward_length u32              = 14336\n",
      "llama_model_loader: - kv   6:                 llama.attention.head_count u32              = 32\n",
      "llama_model_loader: - kv   7:              llama.attention.head_count_kv u32              = 8\n",
      "llama_model_loader: - kv   8:                       llama.rope.freq_base f32              = 500000.000000\n",
      "llama_model_loader: - kv   9:     llama.attention.layer_norm_rms_epsilon f32              = 0.000010\n",
      "llama_model_loader: - kv  10:                          general.file_type u32              = 15\n",
      "llama_model_loader: - kv  11:                           llama.vocab_size u32              = 128256\n",
      "llama_model_loader: - kv  12:                 llama.rope.dimension_count u32              = 128\n",
      "llama_model_loader: - kv  13:                       tokenizer.ggml.model str              = gpt2\n",
      "llama_model_loader: - kv  14:                         tokenizer.ggml.pre str              = llama-bpe\n",
      "llama_model_loader: - kv  15:                      tokenizer.ggml.tokens arr[str,128256]  = [\"!\", \"\\\"\", \"#\", \"$\", \"%\", \"&\", \"'\", ...\n",
      "llama_model_loader: - kv  16:                  tokenizer.ggml.token_type arr[i32,128256]  = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...\n",
      "llama_model_loader: - kv  17:                      tokenizer.ggml.merges arr[str,280147]  = [\"Ġ Ġ\", \"Ġ ĠĠĠ\", \"ĠĠ ĠĠ\", \"...\n",
      "llama_model_loader: - kv  18:                tokenizer.ggml.bos_token_id u32              = 128000\n",
      "llama_model_loader: - kv  19:                tokenizer.ggml.eos_token_id u32              = 128001\n",
      "llama_model_loader: - kv  20:               general.quantization_version u32              = 2\n",
      "llama_model_loader: - type  f32:   65 tensors\n",
      "llama_model_loader: - type q4_K:  193 tensors\n",
      "llama_model_loader: - type q6_K:   33 tensors\n",
      "llm_load_vocab: special tokens definition check successful ( 256/128256 ).\n",
      "llm_load_print_meta: format           = GGUF V3 (latest)\n",
      "llm_load_print_meta: arch             = llama\n",
      "llm_load_print_meta: vocab type       = BPE\n",
      "llm_load_print_meta: n_vocab          = 128256\n",
      "llm_load_print_meta: n_merges         = 280147\n",
      "llm_load_print_meta: n_ctx_train      = 8192\n",
      "llm_load_print_meta: n_embd           = 4096\n",
      "llm_load_print_meta: n_head           = 32\n",
      "llm_load_print_meta: n_head_kv        = 8\n",
      "llm_load_print_meta: n_layer          = 32\n",
      "llm_load_print_meta: n_rot            = 128\n",
      "llm_load_print_meta: n_embd_head_k    = 128\n",
      "llm_load_print_meta: n_embd_head_v    = 128\n",
      "llm_load_print_meta: n_gqa            = 4\n",
      "llm_load_print_meta: n_embd_k_gqa     = 1024\n",
      "llm_load_print_meta: n_embd_v_gqa     = 1024\n",
      "llm_load_print_meta: f_norm_eps       = 0.0e+00\n",
      "llm_load_print_meta: f_norm_rms_eps   = 1.0e-05\n",
      "llm_load_print_meta: f_clamp_kqv      = 0.0e+00\n",
      "llm_load_print_meta: f_max_alibi_bias = 0.0e+00\n",
      "llm_load_print_meta: f_logit_scale    = 0.0e+00\n",
      "llm_load_print_meta: n_ff             = 14336\n",
      "llm_load_print_meta: n_expert         = 0\n",
      "llm_load_print_meta: n_expert_used    = 0\n",
      "llm_load_print_meta: causal attn      = 1\n",
      "llm_load_print_meta: pooling type     = 0\n",
      "llm_load_print_meta: rope type        = 0\n",
      "llm_load_print_meta: rope scaling     = linear\n",
      "llm_load_print_meta: freq_base_train  = 500000.0\n",
      "llm_load_print_meta: freq_scale_train = 1\n",
      "llm_load_print_meta: n_yarn_orig_ctx  = 8192\n",
      "llm_load_print_meta: rope_finetuned   = unknown\n",
      "llm_load_print_meta: ssm_d_conv       = 0\n",
      "llm_load_print_meta: ssm_d_inner      = 0\n",
      "llm_load_print_meta: ssm_d_state      = 0\n",
      "llm_load_print_meta: ssm_dt_rank      = 0\n",
      "llm_load_print_meta: model type       = 8B\n",
      "llm_load_print_meta: model ftype      = Q4_K - Medium\n",
      "llm_load_print_meta: model params     = 8.03 B\n",
      "llm_load_print_meta: model size       = 4.58 GiB (4.89 BPW) \n",
      "llm_load_print_meta: general.name     = Meta-Llama-3-8B\n",
      "llm_load_print_meta: BOS token        = 128000 '<|begin_of_text|>'\n",
      "llm_load_print_meta: EOS token        = 128001 '<|end_of_text|>'\n",
      "llm_load_print_meta: LF token         = 128 'Ä'\n",
      "llm_load_print_meta: EOT token        = 128009 '<|eot_id|>'\n",
      "ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   no\n",
      "ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes\n",
      "ggml_cuda_init: found 1 CUDA devices:\n",
      "  Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes\n",
      "llm_load_tensors: ggml ctx size =    0.30 MiB\n",
      "llm_load_tensors: offloading 32 repeating layers to GPU\n",
      "llm_load_tensors: offloading non-repeating layers to GPU\n",
      "llm_load_tensors: offloaded 33/33 layers to GPU\n",
      "llm_load_tensors:        CPU buffer size =   281.81 MiB\n",
      "llm_load_tensors:      CUDA0 buffer size =  4403.49 MiB\n",
      "........................................................................................\n",
      "llama_new_context_with_model: n_ctx      = 8192\n",
      "llama_new_context_with_model: n_batch    = 2048\n",
      "llama_new_context_with_model: n_ubatch   = 512\n",
      "llama_new_context_with_model: flash_attn = 0\n",
      "llama_new_context_with_model: freq_base  = 500000.0\n",
      "llama_new_context_with_model: freq_scale = 1\n",
      "llama_kv_cache_init:      CUDA0 KV buffer size =  1024.00 MiB\n",
      "llama_new_context_with_model: KV self size  = 1024.00 MiB, K (f16):  512.00 MiB, V (f16):  512.00 MiB\n",
      "llama_new_context_with_model:  CUDA_Host  output buffer size =     0.49 MiB\n",
      "llama_new_context_with_model:      CUDA0 compute buffer size =   560.00 MiB\n",
      "llama_new_context_with_model:  CUDA_Host compute buffer size =    24.01 MiB\n",
      "llama_new_context_with_model: graph nodes  = 1030\n",
      "llama_new_context_with_model: graph splits = 2\n",
      "\n",
      "system_info: n_threads = 64 / 128 | AVX = 1 | AVX_VNNI = 0 | AVX2 = 1 | AVX512 = 0 | AVX512_VBMI = 0 | AVX512_VNNI = 0 | FMA = 1 | NEON = 0 | ARM_FMA = 0 | F16C = 1 | FP16_VA = 0 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 1 | SSSE3 = 1 | VSX = 0 | MATMUL_INT8 = 0 | LLAMAFILE = 1 | \n",
      "sampling: \n",
      "\trepeat_last_n = 64, repeat_penalty = 1.100, frequency_penalty = 0.000, presence_penalty = 0.000\n",
      "\ttop_k = 40, tfs_z = 1.000, top_p = 0.950, min_p = 0.050, typical_p = 1.000, temp = 1.100\n",
      "\tmirostat = 0, mirostat_lr = 0.100, mirostat_ent = 5.000\n",
      "sampling order: \n",
      "CFG -> Penalties -> top_k -> tfs_z -> typical_p -> top_p -> min_p -> temperature \n",
      "generate: n_ctx = 8192, n_batch = 2048, n_predict = 1024, n_keep = 0\n",
      "\n",
      "\n",
      "\u001b[33m<|begin_of_text|> First Citizen:\n",
      "\n",
      " Before we proceed any further, hear me speak.\n",
      "\n",
      " \n",
      "\n",
      " All:\n",
      "\n",
      " Speak, speak.\n",
      "\n",
      " \n",
      "\n",
      " First Citizen:\n",
      "\n",
      " You are all resolved rather to die than to famish?\n",
      "\n",
      " \n",
      "\n",
      " All:\n",
      "\n",
      " Resolved. resolved.\n",
      "\n",
      " \n",
      "\n",
      " First Citizen:\n",
      "\n",
      " First, you know Caius Marcius is chief enemy to the people.\n",
      "\n",
      " \n",
      "\n",
      " All:\n",
      "\n",
      " We know't, we know't.\n",
      "\n",
      " \n",
      "\n",
      " First Citizen:\n",
      "\n",
      " Let us kill him, and we'll have corn at our own price. Is't a verdict?\n",
      "\n",
      " \n",
      "\n",
      " All:\n",
      "\n",
      " No more talking on't; let it be done: away, away!\n",
      "\n",
      " \n",
      "\n",
      " Second Citizen:\n",
      "\n",
      " One word, good citizens.\n",
      "\n",
      " \n",
      "\n",
      " First Citizen:\n",
      "\n",
      " We are accounted poor citizens, the patricians good. What authority surfeits on would relieve us: if they would yield us but the superfluity,  while it were wholesome, we might guess they relieved us humanely; but they think we are too dear: the leanness that afflicts us, the object of  our misery, is as an inventory to particularise their abundance; our sufferance is a gain to them Let us revenge this with our pikes,  ere we become rakes: for the gods know I speak this in hunger for bread, not in thirst for revenge.\n",
      "\n",
      " \n",
      "\n",
      " \u001b[0m2d Cit:\n",
      "\n",
      " If there were no more but the empty titles madam, then should you scrub the earth with hue and cry. No,  it is abusèd from a globe, finite and small; within which limits, all that we innaginate or believe or see as on a map doth shrink to this: small pattern of horrid shade--an inch of opportunist advantage; now proves itself upon our actions, let all the world resolve us wretches. I am as free as Caesar, but since Caesar is become so tyrannical, what should be in that 'Caesar' that we should not even die to rid him before himself?\n",
      "\n",
      " \n",
      "\n",
      " 3d Cit:\n",
      "\n",
      " There is nothing in this world capricious, cruel or unfeeling that would not relent with the slightest show of affection. So let us all resolve against Caesar and our own miseries at the same time.\n",
      "\n",
      " \n",
      "\n",
      " All:\n",
      "\n",
      " Let it be so: their resolution is not past; every man must die but that shall come the space wherein I will perform that comedy which I promised your  valors. The great man's the proper weapon of th' war.\n",
      "\n",
      " \n",
      "\n",
      " First Citizen:\n",
      "\n",
      " You have spoke it, there fore it needs must follow as a matter of fact, for otherwise your words would be deemed idle and of little substance. So let us go forth: by order of the people, I do pour my heart into this trumpet, for our mouths shall proclaim against Caesar our indignations.\n",
      "\n",
      " \n",
      "\n",
      " All:\n",
      "\n",
      " March.\n",
      "\n",
      " <|end_of_text|> [end of text]\n",
      "\n",
      "llama_print_timings:        load time =    1937.08 ms\n",
      "llama_print_timings:      sample time =     420.43 ms /   309 runs   (    1.36 ms per token,   734.96 tokens per second)\n",
      "llama_print_timings: prompt eval time =     124.33 ms /   258 tokens (    0.48 ms per token,  2075.11 tokens per second)\n",
      "llama_print_timings:        eval time =    2779.96 ms /   308 runs   (    9.03 ms per token,   110.79 tokens per second)\n",
      "llama_print_timings:       total time =    3652.01 ms /   566 tokens\n",
      "Log end\n"
     ]
    }
   ],
   "source": [
    "# Start inference on a gguf model (-h to show all options)\n",
    "!./main -ngl 10000 -m ./models/8B-v3/ggml-model-Q4_K_M.gguf --color --temp 1.1 --repeat_penalty 1.1 -c 0 -n 1024 -e -s 0 -p \"\"\"\\\n",
    "First Citizen:\\n\\n\\\n",
    "Before we proceed any further, hear me speak.\\n\\n\\\n",
    "\\n\\n\\\n",
    "All:\\n\\n\\\n",
    "Speak, speak.\\n\\n\\\n",
    "\\n\\n\\\n",
    "First Citizen:\\n\\n\\\n",
    "You are all resolved rather to die than to famish?\\n\\n\\\n",
    "\\n\\n\\\n",
    "All:\\n\\n\\\n",
    "Resolved. resolved.\\n\\n\\\n",
    "\\n\\n\\\n",
    "First Citizen:\\n\\n\\\n",
    "First, you know Caius Marcius is chief enemy to the people.\\n\\n\\\n",
    "\\n\\n\\\n",
    "All:\\n\\n\\\n",
    "We know't, we know't.\\n\\n\\\n",
    "\\n\\n\\\n",
    "First Citizen:\\n\\n\\\n",
    "Let us kill him, and we'll have corn at our own price. Is't a verdict?\\n\\n\\\n",
    "\\n\\n\\\n",
    "All:\\n\\n\\\n",
    "No more talking on't; let it be done: away, away!\\n\\n\\\n",
    "\\n\\n\\\n",
    "Second Citizen:\\n\\n\\\n",
    "One word, good citizens.\\n\\n\\\n",
    "\\n\\n\\\n",
    "First Citizen:\\n\\n\\\n",
    "We are accounted poor citizens, the patricians good. What authority surfeits on would relieve us: if they would yield us but the superfluity, \\\n",
    "while it were wholesome, we might guess they relieved us humanely; but they think we are too dear: the leanness that afflicts us, the object of \\\n",
    "our misery, is as an inventory to particularise their abundance; our sufferance is a gain to them Let us revenge this with our pikes, \\\n",
    "ere we become rakes: for the gods know I speak this in hunger for bread, not in thirst for revenge.\\n\\n\\\n",
    "\\n\\n\\\n",
    "\"\"\"\n",
    "\n",
    "# # Chat template for Termianl (Use the instruction-tuned models to better follow the template)\n",
    "# !./main -ngl 10000 -m ./models/8B-v3-instruct/ggml-model-Q4_K_M.gguf --color -c 0 -n -2 -e -s 0 --mirostat 2 -i --no-display-prompt --keep -1 \\\n",
    "# -r '<|eot_id|>' -p '<|begin_of_text|><|start_header_id|>system<|end_header_id|>\\n\\nYou are a helpful assistant.<|eot_id|><|start_header_id|>user<|end_header_id|>\\n\\nHi!<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n' \\\n",
    "# --in-prefix '<|start_header_id|>user<|end_header_id|>\\n\\n' --in-suffix '<|eot_id|><|start_header_id|>assistant<|end_header_id|>\\n\\n'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "17c21608",
   "metadata": {},
   "source": [
    "## Benchmarks"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "2b6aee37",
   "metadata": {},
   "source": [
    "### 8B Q4_K_M"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "996ee79e",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   no\n",
      "ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes\n",
      "ggml_cuda_init: found 1 CUDA devices:\n",
      "  Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes\n",
      "| model                          |       size |     params | backend    | ngl |          test |              t/s |\n",
      "| ------------------------------ | ---------: | ---------: | ---------- | --: | ------------: | ---------------: |\n",
      "| llama 8B Q4_K - Medium         |   4.58 GiB |     8.03 B | CUDA       |  99 |         pp512 | 2978.34 ± 221.71 |\n",
      "| llama 8B Q4_K - Medium         |   4.58 GiB |     8.03 B | CUDA       |  99 |        pp1024 | 2866.67 ± 130.93 |\n",
      "| llama 8B Q4_K - Medium         |   4.58 GiB |     8.03 B | CUDA       |  99 |        pp4096 |   2382.22 ± 8.01 |\n",
      "| llama 8B Q4_K - Medium         |   4.58 GiB |     8.03 B | CUDA       |  99 |        pp8192 |  1919.78 ± 12.98 |\n",
      "| llama 8B Q4_K - Medium         |   4.58 GiB |     8.03 B | CUDA       |  99 |         tg512 |     89.67 ± 0.67 |\n",
      "| llama 8B Q4_K - Medium         |   4.58 GiB |     8.03 B | CUDA       |  99 |        tg1024 |     87.84 ± 0.76 |\n",
      "| llama 8B Q4_K - Medium         |   4.58 GiB |     8.03 B | CUDA       |  99 |        tg4096 |     80.85 ± 0.40 |\n",
      "| llama 8B Q4_K - Medium         |   4.58 GiB |     8.03 B | CUDA       |  99 |        tg8192 |     74.22 ± 0.35 |\n",
      "| llama 8B Q4_K - Medium         |   4.58 GiB |     8.03 B | CUDA       |  99 |   pp512+tg128 |    388.15 ± 1.32 |\n",
      "\n",
      "build: e8496488 (2843)\n"
     ]
    }
   ],
   "source": [
    "!./llama-bench -p 512,1024,4096,8192 -n 512,1024,4096,8192 -m ./models/8B-v3/ggml-model-Q4_K_M.gguf"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0f07d6cd",
   "metadata": {},
   "source": [
    "### 8B F16"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "cb0b80b3",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "ggml_cuda_init: GGML_CUDA_FORCE_MMQ:   no\n",
      "ggml_cuda_init: CUDA_USE_TENSOR_CORES: yes\n",
      "ggml_cuda_init: found 1 CUDA devices:\n",
      "  Device 0: NVIDIA GeForce RTX 3090, compute capability 8.6, VMM: yes\n",
      "| model                          |       size |     params | backend    | ngl |          test |              t/s |\n",
      "| ------------------------------ | ---------: | ---------: | ---------- | --: | ------------: | ---------------: |\n",
      "| llama 8B F16                   |  14.96 GiB |     8.03 B | CUDA       |  99 |         pp512 | 3509.84 ± 331.76 |\n",
      "| llama 8B F16                   |  14.96 GiB |     8.03 B | CUDA       |  99 |        pp1024 | 3406.59 ± 145.43 |\n",
      "| llama 8B F16                   |  14.96 GiB |     8.03 B | CUDA       |  99 |        pp4096 |   2705.05 ± 7.78 |\n",
      "| llama 8B F16                   |  14.96 GiB |     8.03 B | CUDA       |  99 |        pp8192 |   2134.50 ± 6.18 |\n",
      "| llama 8B F16                   |  14.96 GiB |     8.03 B | CUDA       |  99 |         tg512 |     39.13 ± 0.45 |\n",
      "| llama 8B F16                   |  14.96 GiB |     8.03 B | CUDA       |  99 |        tg1024 |     40.63 ± 0.43 |\n",
      "| llama 8B F16                   |  14.96 GiB |     8.03 B | CUDA       |  99 |        tg4096 |     39.44 ± 0.21 |\n",
      "| llama 8B F16                   |  14.96 GiB |     8.03 B | CUDA       |  99 |        tg8192 |     37.88 ± 0.15 |\n",
      "| llama 8B F16                   |  14.96 GiB |     8.03 B | CUDA       |  99 |   pp512+tg128 |    194.84 ± 0.33 |\n",
      "\n",
      "build: e8496488 (2843)\n"
     ]
    }
   ],
   "source": [
    "!./llama-bench -p 512,1024,4096,8192 -n 512,1024,4096,8192 -m ./models/8B-v3/ggml-model-f16.gguf"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.12"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}
